In [1]:
from bertopic import BERTopic

# Load the BERTopic model that was saved to disk under model_save/; every
# later cell queries or visualises this `topic_model` object.
topic_model = BERTopic.load("/home/zhhuang/climate_policy_paper/code/model_save/bert_topic_iea_cp_cclw_model")
/home/zhhuang/anaconda3/envs/climatepolicy/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
In [2]:
import pandas as pd

# Read the document/year spreadsheet and pull the two columns used below out
# as plain Python lists: the document texts and their "Year" values.
df = pd.read_excel("/home/zhhuang/climate_policy_paper/code/data/Topic_docs_time_iea_cp_cclw.xlsx")
docs = df["docs"].to_list()
timestamp = df["Year"].to_list()
In [3]:
# NOTE(review): the value of this first expression is neither assigned nor the
# last expression of the cell, so the notebook never displays it.
topic_model.get_topic_info()['Topic']
# Per-document table restricted to the listed columns; as the cell's last
# expression this is what the notebook renders.
topic_model.get_document_info(docs)[["Topic", "Name", "Top_n_words", "Probability", "Representative_document"]]
Out[3]:
Topic Name Top_n_words Probability Representative_document
0 -1 -1_energy_project_development_support energy - project - development - support - emi... 0.132439 False
1 -1 -1_energy_project_development_support energy - project - development - support - emi... 0.077007 False
2 -1 -1_energy_project_development_support energy - project - development - support - emi... 0.695282 False
3 -1 -1_energy_project_development_support energy - project - development - support - emi... 0.125002 False
4 0 0_energy_climate_development_emission energy - climate - development - emission - ma... 0.327378 False
... ... ... ... ... ...
9883 28 28_transparent_generation_reliable_competition transparent - generation - reliable - competit... 1.000000 False
9884 -1 -1_energy_project_development_support energy - project - development - support - emi... 0.697941 False
9885 6 6_biofuels_biofuel_biodiesel_ethanol biofuels - biofuel - biodiesel - ethanol - die... 0.261042 False
9886 11 11_carbon_storage_energy_efficiency carbon - storage - energy - efficiency - innov... 0.209579 False
9887 0 0_energy_climate_development_emission energy - climate - development - emission - ma... 0.398140 False

9888 rows × 5 columns

In [4]:
# Count how often each whitespace-separated token occurs across all documents
# and print the (up to) 100 most frequent tokens with their counts.
from collections import Counter

counts = Counter()
for doc in docs:
    counts.update(doc.split())

# Sort by descending count; the sort is stable, so tokens with equal counts
# keep their first-seen order.
items = sorted(counts.items(), key=lambda x: x[1], reverse=True)

# Slice instead of indexing range(100) so a vocabulary with fewer than 100
# distinct tokens prints what is available instead of raising IndexError.
for word, count in items[:100]:
    print("{0:<10}{1:>5}".format(word, count))
energy    24258
renewable  5814
efficiency 5782
emission   5445
project    4847
plan       4715
development 4620
electricity 4287
system     3964
climate    3949
building   3882
sector     3818
power      3782
national   3731
support    3443
gas        3369
policy     3201
target     3009
measure    3008
technology 2991
vehicle    2941
reduce     2899
include    2861
fuel       2853
standard   2834
government 2781
change     2620
programme  2563
increase   2511
tax        2409
heat       2402
source     2389
set        2385
production 2301
promote    2260
public     2253
provide    2251
reduction  2230
investment 2174
strategy   2124
requirement 2094
consumption 2073
environmental 2049
solar      2008
establish  1996
capacity   1977
aim        1973
program    1942
sustainable 1918
management 1901
carbon     1885
industry   1881
level      1878
transport  1859
develop    1817
action     1814
resource   1807
generation 1801
cost       1738
electric   1708
``         1695
market     1693
company    1665
improve    1658
fund       1650
plant      1636
air        1615
scheme     1607
wind       1600
product    1537
supply     1522
water      1499
green      1488
objective  1483
total      1466
build      1457
base       1443
equipment  1427
implementation 1427
implement  1409
service    1408
efficient  1403
country    1399
activity   1385
grant      1362
require    1357
achieve    1344
greenhouse 1317
goal       1288
ensure     1271
''         1270
environment 1264
installation 1264
performance 1245
natural    1229
forest     1226
framework  1221
facility   1214
local      1181
tariff     1173
In [5]:
# Ask the model for 5 topics matching the search term "Transport", then show
# the word/score pairs of the first topic returned.
# NOTE(review): the recorded output for this cell is led by 'transparent'
# rather than transport-related words — confirm this is the intended match.
similar_topics, similarity = topic_model.find_topics("Transport", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[5]:
[('transparent', 0.07180454680384146),
 ('generation', 0.03550348292371022),
 ('reliable', 0.03417482361218522),
 ('competition', 0.03389988982837627),
 ('electrical', 0.02941211389437091),
 ('flexible', 0.02925387541205265),
 ('transmission', 0.028266950447326415),
 ('affordable', 0.0254538800551447),
 ('financially', 0.02472697203215678),
 ('wholesale', 0.02128862307104659)]
In [6]:
# Ask the model for 5 topics matching the search term "Industry", then show
# the word/score pairs of the first topic returned.
# NOTE(review): the recorded output for this cell is led by 'vehicle' and
# 'fuel' — confirm this is the intended match for "Industry".
similar_topics, similarity = topic_model.find_topics("Industry", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[6]:
[('vehicle', 0.09872622440931626),
 ('fuel', 0.03216303864189669),
 ('emission', 0.02477394343334291),
 ('mobility', 0.021767777207266862),
 ('passenger', 0.020463693943990093),
 ('purchase', 0.01845449245252215),
 ('hybrid', 0.017259508077390077),
 ('truck', 0.01231378011249428),
 ('government', 0.011820709804332022),
 ('battery', 0.010923226473897669)]
In [7]:
# Ask the model for 5 topics matching the search term "Energy systems", then
# show the word/score pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics("Energy systems", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[7]:
[('pv', 0.10658574246167805),
 ('photovoltaic', 0.06793013138935346),
 ('certification', 0.057393732829111865),
 ('geothermal', 0.03734211062109081),
 ('photovoltaics', 0.031011521070597656),
 ('manufacturing', 0.02982326407303994),
 ('irradiance', 0.026549480326347224),
 ('manufacture', 0.025380053496348042),
 ('intelligent', 0.024372310694128513),
 ('generation', 0.02249437089718655)]
In [8]:
# Ask the model for 5 topics matching the search term "Buildings", then show
# the word/score pairs of the first topic returned.
similar_topics, similarity = topic_model.find_topics("Buildings", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[8]:
[('building', 0.11855952002243023),
 ('residential', 0.044670908054186095),
 ('requirement', 0.038122615955129904),
 ('energy', 0.03736539585568927),
 ('certificate', 0.026989005080772135),
 ('mandatory', 0.017764517473103085),
 ('renovation', 0.016992133155504307),
 ('insulation', 0.01656538204714158),
 ('certification', 0.015874596419938086),
 ('measure', 0.012512507170089183)]
In [9]:
# Ask the model for 5 topics matching the search term "Agriculture, Forestry
# and Other Land Use", then show the word/score pairs of the first one.
# NOTE(review): the recorded output for this cell is led by 'petroleum',
# 'exploitation' and 'exploration' — confirm this is the intended match.
similar_topics, similarity = topic_model.find_topics("Agriculture, Forestry and Other Land Use", top_n=5)
topic_model.get_topic(similar_topics[0])
Out[9]:
[('petroleum', 0.16396635427164843),
 ('exploitation', 0.12613048244983324),
 ('exploration', 0.12022723651321504),
 ('regulate', 0.06417742232836783),
 ('geothermal', 0.04750483692936236),
 ('prohibit', 0.03577380577199707),
 ('hydrocarbon', 0.031090741237797447),
 ('formally', 0.028261385335496114),
 ('hydraulic', 0.027679243464891794),
 ('establishes', 0.02407894665799679)]
In [10]:
# Sanity check: number of documents loaded from the spreadsheet.
len(docs)
Out[10]:
9888
In [11]:
import os

# Directory under which the figure-export cells of this notebook write their
# SVG files.
images_path = "/home/zhhuang/climate_policy_paper/paper_images"
# exist_ok=True makes the cell safe to re-run and replaces the separate
# exists() check, removing the gap between checking and creating.
os.makedirs(images_path, exist_ok=True)
In [12]:
import plotly.io as pio
# Make SVG the default format for static image export through kaleido.
pio.kaleido.scope.default_format = "svg"
# Optional (disabled): point kaleido at a specific MathJax build, e.g.
# pio.kaleido.scope.mathjax = "https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/MathJax.js"
In [13]:
# Per-topic word bar charts (top_n_topics=30, 20 words each, 300x300 panels),
# exported as SVG and displayed inline.
fig = topic_model.visualize_barchart(top_n_topics=30, n_words=20, width=300, height=300)
# Build the target from images_path (defined and created in the earlier setup
# cell) instead of repeating the directory literal, so the export always goes
# to the directory that was actually created.
pio.write_image(fig, os.path.join(images_path, 'topic_iea_cp_cclw_barchart.svg'))
fig
In [14]:
# topic_model.visualize_barchart(top_n_topics = 20, n_words=10,width = 300, height= 300)
In [15]:
# Topic heatmap, exported as SVG and displayed inline.
fig2 = topic_model.visualize_heatmap()
# Build the target from images_path (defined and created in the earlier setup
# cell) instead of repeating the directory literal.
pio.write_image(fig2, os.path.join(images_path, 'topic_iea_cp_cclw_heatmap.svg'))
fig2
In [16]:
# Topic overview figure from visualize_topics(), exported as SVG and
# displayed inline.
fig3 = topic_model.visualize_topics()
# Build the target from images_path (defined and created in the earlier setup
# cell) instead of repeating the directory literal.
pio.write_image(fig3, os.path.join(images_path, 'topic_iea_cp_cclw_visualize_topics.svg'))
fig3
In [17]:
# Compute the topic hierarchy, persist it to an Excel workbook, read it back
# and plot it. The recorded run of this cell took about 26 minutes.
hierarchical_topics = topic_model.hierarchical_topics(docs)

# A single path is used for both the write and the read-back. Previously the
# write went to a cwd-relative file while the read used this absolute path, so
# a stale workbook (or none at all) was read whenever the notebook was not
# started from the code directory.
hierarchical_topics_path = "/home/zhhuang/climate_policy_paper/code/Topic_iea_cp_cclw_hierarchical_topics.xlsx"
# strings_to_urls=False stops xlsxwriter from converting URL-like strings into
# hyperlinks when writing cells.
with pd.ExcelWriter(hierarchical_topics_path, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    hierarchical_topics.to_excel(writer)

# NOTE(review): the frame passed to visualize_hierarchy is the one re-read from
# Excel, not the in-memory result; column dtypes may differ after the round
# trip — confirm this is intended.
hierarchical_topics = pd.read_excel(hierarchical_topics_path)
fig4 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
# Build the target from images_path (defined and created in the earlier setup
# cell) instead of repeating the directory literal.
pio.write_image(fig4, os.path.join(images_path, 'topic_iea_cp_cclw_hierarchical_topics.svg'))
fig4
100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [25:58<00:00, 55.66s/it]
In [18]:
# Normalise the year values to strings for datetime_format="%Y". A year of 0
# is replaced by '2020'; comparing on the string form catches both the string
# '0' and an integer 0 (the previous `i == '0'` test never matched an integer).
timestamp = ['2020' if str(year) == '0' else str(year) for year in timestamp]

# Topic representation per time bin (20 bins). The recorded run of this cell
# took about 4 hours.
topics_over_time = topic_model.topics_over_time(docs, timestamp, datetime_format="%Y", nr_bins=20)

# Write to the same absolute path that the following cell reads back, so the
# two cells agree no matter which working directory the notebook starts in.
topics_over_time_path = "/home/zhhuang/climate_policy_paper/code/Topic_iea_cp_cclw_topics_over_time.xlsx"
# strings_to_urls=False stops xlsxwriter from converting URL-like strings into
# hyperlinks when writing cells.
with pd.ExcelWriter(topics_over_time_path, engine='xlsxwriter',
                    engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
    topics_over_time.to_excel(writer)
19it [4:01:49, 763.67s/it] 
In [19]:
# Reload the saved topics-over-time table and plot it; the figure is exported
# as SVG and displayed inline.
topics_over_time = pd.read_excel("/home/zhhuang/climate_policy_paper/code/Topic_iea_cp_cclw_topics_over_time.xlsx")
# To restrict the plot to fewer topics, pass e.g. top_n_topics=20.
fig5 = topic_model.visualize_topics_over_time(topics_over_time)
# Build the target from images_path (defined and created in the earlier setup
# cell) instead of repeating the directory literal.
pio.write_image(fig5, os.path.join(images_path, 'topic_iea_cp_cclw_visualize_topics_over_time.svg'))
fig5
In [ ]: